<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="zh-Hans" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="zh-Hans" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  
  <link rel="shortcut icon" href="../../img/favicon.ico">
  <title>文本预处理 - Keras 中文文档</title>
  <link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>

  <link rel="stylesheet" href="../../css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../css/theme_extra.css" type="text/css" />
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css">
  
  <script>
    // Metadata for the page being rendered, declared as script-level globals
    // (presumably read by the theme/search scripts loaded later — confirm).
    var mkdocs_page_name = "\u6587\u672c\u9884\u5904\u7406",
        mkdocs_page_input_path = "preprocessing/text.md",
        mkdocs_page_url = "/zh/preprocessing/text/";
  </script>
  
  <script src="../../js/jquery-2.1.1.min.js" defer></script>
  <script src="../../js/modernizr-2.8.3.min.js" defer></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
  <script>hljs.initHighlightingOnLoad();</script> 
  
  <script>
      // Google Analytics (analytics.js) loader.
      // The immediately-invoked function below:
      //   1. stores the name 'ga' in window['GoogleAnalyticsObject'];
      //   2. keeps an existing window.ga, or else defines it as a stub that
      //      pushes each call's arguments onto the ga.q array (presumably so
      //      calls made before the library loads can be replayed — confirm);
      //   3. sets ga.l to the current time as a number (1 * new Date());
      //   4. creates a script element with async set and src pointing at
      //      analytics.js, and inserts it before the first script element
      //      found in the document.
      (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
      m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
      })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

      // Issue a 'create' command for property UA-61785484-1; the third argument
      // 'keras.io' is presumably the cookie domain — confirm against GA docs.
      ga('create', 'UA-61785484-1', 'keras.io');
      // Issue a 'send' command of type 'pageview' for this page load.
      ga('send', 'pageview');
  </script>
  
</head>

<body class="wy-body-for-nav" role="document">

  <div class="wy-grid-for-nav">

    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
      <div class="wy-side-nav-search">
        <a href="../.." class="icon icon-home"> Keras 中文文档</a>
        <div role="search">
  <form id ="rtd-search-form" class="wy-form" action="../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" title="Type search term here" aria-label="Search docs" />
  </form>
</div>
      </div>

      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
	<ul class="current">
	  
          
            <li class="toctree-l1">
		
    <a class="" href="../..">主页</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../why-use-keras/">为什么选择 Keras?</a>
	    </li>
          
            <li class="toctree-l1">
		
    <span class="caption-text">快速开始</span>
    <ul class="subnav">
                <li class="">
                    
    <a class="" href="../../getting-started/sequential-model-guide/">Sequential 顺序模型指引</a>
                </li>
                <li class="">
                    
    <a class="" href="../../getting-started/functional-api-guide/">函数式 API 指引</a>
                </li>
                <li class="">
                    
    <a class="" href="../../getting-started/faq/">FAQ 常见问题解答</a>
                </li>
    </ul>
	    </li>
          
            <li class="toctree-l1">
		
    <span class="caption-text">模型</span>
    <ul class="subnav">
                <li class="">
                    
    <a class="" href="../../models/about-keras-models/">关于 Keras 模型</a>
                </li>
                <li class="">
                    
    <a class="" href="../../models/sequential/">Sequential 顺序模型 API</a>
                </li>
                <li class="">
                    
    <a class="" href="../../models/model/">函数式 API</a>
                </li>
    </ul>
	    </li>
          
            <li class="toctree-l1">
		
    <span class="caption-text">Layers</span>
    <ul class="subnav">
                <li class="">
                    
    <a class="" href="../../layers/about-keras-layers/">关于 Keras 网络层</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/core/">核心网络层</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/convolutional/">卷积层 Convolutional</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/pooling/">池化层 Pooling</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/local/">局部连接层 Locally-connected</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/recurrent/">循环层 Recurrent</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/embeddings/">嵌入层 Embedding</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/merge/">融合层 Merge</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/advanced-activations/">高级激活层 Advanced Activations</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/normalization/">标准化层 Normalization</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/noise/">噪声层 Noise</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/wrappers/">层封装器 wrappers</a>
                </li>
                <li class="">
                    
    <a class="" href="../../layers/writing-your-own-keras-layers/">编写你自己的层</a>
                </li>
    </ul>
	    </li>
          
            <li class="toctree-l1">
		
    <span class="caption-text">数据预处理</span>
    <ul class="subnav">
                <li class="">
                    
    <a class="" href="../sequence/">序列预处理</a>
                </li>
                <li class=" current">
                    
    <a class="current" href="./">文本预处理</a>
    <ul class="subnav">
            
    <li class="toctree-l3"><a href="#text-preprocessing">Text Preprocessing</a></li>
    

    <li class="toctree-l3"><a href="#tokenizer">Tokenizer</a></li>
    

    <li class="toctree-l3"><a href="#hashing_trick">hashing_trick</a></li>
    

    <li class="toctree-l3"><a href="#one_hot">one_hot</a></li>
    

    <li class="toctree-l3"><a href="#text_to_word_sequence">text_to_word_sequence</a></li>
    

    </ul>
                </li>
                <li class="">
                    
    <a class="" href="../image/">图像预处理</a>
                </li>
    </ul>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../losses/">损失函数 Losses</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../metrics/">评估标准 Metrics</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../optimizers/">优化器 Optimizers</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../activations/">激活函数 Activations</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../callbacks/">回调函数 Callbacks</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../datasets/">常用数据集 Datasets</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../applications/">应用 Applications</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../backend/">后端 Backend</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../initializers/">初始化 Initializers</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../regularizers/">正则化 Regularizers</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../constraints/">约束 Constraints</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../visualization/">可视化 Visualization</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../scikit-learn-api/">Scikit-learn API</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../utils/">工具</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../../contributing/">贡献</a>
	    </li>
          
            <li class="toctree-l1">
		
    <span class="caption-text">经典样例</span>
    <ul class="subnav">
                <li class="">
                    
    <a class="" href="../../examples/addition_rnn/">Addition RNN</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/babi_rnn/">bAbI RNN</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/babi_memnn/">bAbI MemNN</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/cifar10_cnn/">CIFAR-10 CNN</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/cifar10_cnn_capsule/">CIFAR-10 CNN-Capsule</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/cifar10_cnn_tfaugment2d/">CIFAR-10 CNN with augmentation (TF)</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/cifar10_resnet/">CIFAR-10 ResNet</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/conv_filter_visualization/">Convolution filter visualization</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/image_ocr/">Image OCR</a>
                </li>
                <li class="">
                    
    <a class="" href="../../examples/imdb_bidirectional_lstm/">Bidirectional LSTM</a>
                </li>
    </ul>
	    </li>
          
        </ul>
      </div>
      &nbsp;
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

      
      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
        <a href="../..">Keras 中文文档</a>
      </nav>

      
      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="breadcrumbs navigation">
  <ul class="wy-breadcrumbs">
    <li><a href="../..">Docs</a> &raquo;</li>
    
      
        
          <li>数据预处理 &raquo;</li>
        
      
    
    <li>文本预处理</li>
    <li class="wy-breadcrumbs-aside">
      
        <a href="https://github.com/keras-team/keras-docs-zh/edit/master/docs/preprocessing/text.md"
          class="icon icon-github"> Edit on GitHub</a>
      
    </li>
  </ul>
  <hr/>
</div>
          <div role="main">
            <div class="section">
              
                <h3 id="text-preprocessing">Text Preprocessing</h3>
<p><span style="float:right;"><a href="https://github.com/keras-team/keras/blob/master/keras/preprocessing/text.py#L138">[source]</a></span></p>
<h3 id="tokenizer">Tokenizer</h3>
<pre><code class="python">keras.preprocessing.text.Tokenizer(num_words=None, 
                                   filters='!&quot;#$%&amp;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~\t\n', 
                                   lower=True, 
                                   split=' ', 
                                   char_level=False, 
                                   oov_token=None, 
                                   document_count=0)
</code></pre>

<p>文本标记实用类。</p>
<p>该类允许使用两种方法向量化一个文本语料库：
将每个文本转化为一个整数序列（每个整数都是词典中标记的索引）；
或者将其转化为一个向量，其中每个标记的系数可以是二进制值、词频、TF-IDF权重等。</p>
<p><strong>参数</strong></p>
<ul>
<li><strong>num_words</strong>: 需要保留的最大词数，基于词频。只有最常出现的 <code>num_words</code> 词会被保留。</li>
<li><strong>filters</strong>: 一个字符串，其中每个元素是一个将从文本中过滤掉的字符。默认值是所有标点符号，加上制表符和换行符，减去 <code>'</code> 字符。</li>
<li><strong>lower</strong>: 布尔值。是否将文本转换为小写。</li>
<li><strong>split</strong>: 字符串。按该字符串切割文本。</li>
<li><strong>char_level</strong>: 如果为 True，则每个字符都将被视为标记。</li>
<li><strong>oov_token</strong>: 如果给出，它将被添加到 word_index 中，并用于在 <code>texts_to_sequences</code> 调用期间替换词汇表外的单词。</li>
</ul>
<p>默认情况下，删除所有标点符号，将文本转换为空格分隔的单词序列（单词可能包含 <code>'</code> 字符）。
这些序列然后被分割成标记列表。然后它们将被索引或向量化。</p>
<p><code>0</code> 是不会被分配给任何单词的保留索引。</p>
<hr />
<h3 id="hashing_trick">hashing_trick</h3>
<pre><code class="python">keras.preprocessing.text.hashing_trick(text, n,
                                       hash_function=None, 
                                       filters='!&quot;#$%&amp;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~\t\n', 
                                       lower=True, 
                                       split=' ')
</code></pre>

<p>将文本转换为固定大小散列空间中的索引序列。</p>
<p><strong>参数</strong></p>
<ul>
<li><strong>text</strong>: 输入文本（字符串）。</li>
<li><strong>n</strong>: 散列空间维度。</li>
<li><strong>hash_function</strong>: 默认为 Python 内置的 <code>hash</code> 函数，可以是 'md5' 或任意接受输入字符串并返回整数的函数。注意 'hash' 不是稳定的散列函数，所以它在不同的运行中不一致，而 'md5' 是一个稳定的散列函数。</li>
<li><strong>filters</strong>: 要过滤的字符列表（或连接），如标点符号。默认：<code>!"#$%&amp;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~\t\n</code>，包含基本标点符号，制表符和换行符。</li>
<li><strong>lower</strong>: 布尔值。是否将文本转换为小写。</li>
<li><strong>split</strong>: 字符串。按该字符串切割文本。</li>
</ul>
<p><strong>返回</strong></p>
<p>整数词索引列表（唯一性无法保证）。</p>
<p><code>0</code> 是不会被分配给任何单词的保留索引。</p>
<p>由于散列函数可能发生冲突，可能会将两个或更多单词分配给同一索引。
冲突的<a href="https://en.wikipedia.org/wiki/Birthday_problem#Probability_table">概率</a>与散列空间的维度和不同对象的数量有关。</p>
<hr />
<h3 id="one_hot">one_hot</h3>
<pre><code class="python">keras.preprocessing.text.one_hot(text, n, 
                                 filters='!&quot;#$%&amp;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~\t\n', 
                                 lower=True, 
                                 split=' ')
</code></pre>

<p>One-hot 将文本编码为大小为 n 的单词索引列表。</p>
<p>这是 <code>hashing_trick</code> 函数的一个封装，
使用 <code>hash</code> 作为散列函数；单词索引映射无保证唯一性。</p>
<p><strong>参数</strong></p>
<ul>
<li><strong>text</strong>: 输入文本（字符串）。</li>
<li><strong>n</strong>: 整数。词汇表尺寸。</li>
<li><strong>filters</strong>: 要过滤的字符列表（或连接），如标点符号。默认：<code>!"#$%&amp;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~\t\n</code>，包含基本标点符号，制表符和换行符。</li>
<li><strong>lower</strong>: 布尔值。是否将文本转换为小写。</li>
<li><strong>split</strong>: 字符串。按该字符串切割文本。</li>
</ul>
<p><strong>返回</strong></p>
<p>[1, n] 之间的整数列表。每个整数编码一个词（唯一性无法保证）。</p>
<hr />
<h3 id="text_to_word_sequence">text_to_word_sequence</h3>
<pre><code class="python">keras.preprocessing.text.text_to_word_sequence(text, 
                                               filters='!&quot;#$%&amp;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~\t\n', 
                                               lower=True, 
                                               split=' ')
</code></pre>

<p>将文本转换为单词（或标记）的序列。</p>
<p><strong>参数</strong></p>
<ul>
<li><strong>text</strong>: 输入文本（字符串）。</li>
<li><strong>filters</strong>: 要过滤的字符列表（或连接），如标点符号。默认：<code>!"#$%&amp;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~\t\n</code>，包含基本标点符号，制表符和换行符。</li>
<li><strong>lower</strong>: 布尔值。是否将文本转换为小写。</li>
<li><strong>split</strong>: 字符串。按该字符串切割文本。</li>
</ul>
<p><strong>返回</strong></p>
<p>词或标记的列表。</p>
              
            </div>
          </div>
          <footer>
  
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
      
        <a href="../image/" class="btn btn-neutral float-right" title="图像预处理">Next <span class="icon icon-circle-arrow-right"></span></a>
      
      
        <a href="../sequence/" class="btn btn-neutral" title="序列预处理"><span class="icon icon-circle-arrow-left"></span> Previous</a>
      
    </div>
  

  <hr/>

  <div role="contentinfo">
    <!-- Copyright etc -->
    
  </div>

  Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
      
        </div>
      </div>

    </section>

  </div>

  <div class="rst-versions" role="note" style="cursor: pointer">
    <span class="rst-current-version" data-toggle="rst-current-version">
      
          <a href="https://github.com/keras-team/keras-docs-zh/" class="fa fa-github" style="float: left; color: #fcfcfc"> GitHub</a>
      
      
        <span><a href="../sequence/" style="color: #fcfcfc;">&laquo; Previous</a></span>
      
      
        <span style="margin-left: 15px"><a href="../image/" style="color: #fcfcfc">Next &raquo;</a></span>
      
    </span>
</div>
    <script>var base_url = '../..';</script>
    <script src="../../js/theme.js" defer></script>
      <script src="../../search/main.js" defer></script>

</body>
</html>
